(*
   Copyright (c) 2022-2025 Semgrep Inc.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public License
   version 2.1 as published by the Free Software Foundation.

   This library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the file
   LICENSE for more details.
*)
(*
   Utilities for working with the types defined in Semgrep_output_v1.atd

   TODO: Ideally the compare boilerplate would be generated by a 'deriving ord'
   in Output_from_core.atd, but atdgen does not support yet such annotations.
*)
open Common
open Semgrep_output_v1_t

(*****************************************************************************)
(* Helpers *)
(*****************************************************************************)
(* TODO: should use Common? *)

let rec last hd = function
  | [] -> hd
  | hd :: tl -> last hd tl

let first_and_last = function
  | [] -> None
  | hd :: tl -> Some (hd, last hd tl)

(*****************************************************************************)
(* File content accessors *)
(*****************************************************************************)

(* Return the list of lines for a start/end range. Note that
 * we take the whole line. Note also that each line does not contain
 * a trailing "\n" so you may need to String.concat "\n" if you join them.
 *
 * TODO: at some point we should take a Parse_info range, not Out.position
 *  (but in that case don't forget to use Parse_info.get_token_end_info end_)
 * TODO: could be moved to another helper module.
 *
 * Should we use a faster implementation, using a cache
 * to avoid rereading the same file again and again?
 * Use weak hashtable or LRU cache?
 * Or maybe fast enough like this thanks to OS buffer cache?
 *
 * python: # 'lines' already contains '\n' at the end of each line
 *   lines="".join(rule_match.lines).rstrip(),
 *)
let lines_of_file_at_range (range : position * position) (file : Fpath.t) :
    (string list, string) result =
  let start, end_ = range in
  UFile.lines_of_file (start.line, end_.line) file
[@@profiling]

(* Returns the text between the positions; start inclusive, end exclusive.
 * TODO: same than above, ideally would take a Parse_info range
 * TOPORT: It is recommended to open the fd with `open(path, errors="replace")
 * to ignore non-utf8 bytes.
 * See https://stackoverflow.com/a/56441652.
 *)
let content_of_file_at_range (range : position * position) (file : Fpath.t) :
    string =
  let start, end_ = range in
  let str = UFile.read_file file in
  String.sub str start.offset (end_.offset - start.offset)
[@@profiling]

(*****************************************************************************)
(* Tok.t to Out.location *)
(*****************************************************************************)

(* pfff (and Emacs) have the first column at index 0, but not r2c *)
let adjust_column x = x + 1

let position_of_token_location loc =
  {
    line = loc.Loc.pos.line;
    col = adjust_column loc.Loc.pos.column;
    offset = loc.Loc.pos.bytepos;
  }

let position_range min_loc max_loc =
  let end_line, end_col, end_charpos = Loc.end_pos max_loc in
  (* alt: could call position_of_token_location but more symetric like that*)
  ( {
      line = min_loc.Loc.pos.line;
      col = adjust_column min_loc.Loc.pos.column;
      offset = min_loc.Loc.pos.bytepos;
    },
    { line = end_line; col = adjust_column end_col; offset = end_charpos } )

let location_of_token_location loc =
  let start, end_ = position_range loc loc in
  { path = loc.Loc.pos.file; start; end_ }

(* None if pi has no location information. Fake tokens should have been
 * filtered out earlier, but in case one slipped through we handle this case.
 *)
let parse_info_to_location pi =
  Tok.loc_of_tok pi |> Result.to_option |> Option.map location_of_token_location

let tokens_to_locations toks = List_.filter_map parse_info_to_location toks

let tokens_to_single_loc (toks : Tok.t list) : location option =
  (* toks should be nonempty and should contain only origintoks, but since we
   * can't prove that by construction we have to filter and handle the empty
   * case here. In theory this could lead to, e.g. a missing taint source for a
   * taint rule finding but it shouldn't happen in practice. *)
  let locations =
    tokens_to_locations
      (List.filter Tok.is_origintok toks |> List.sort Tok.compare_pos)
  in
  let* first_loc, last_loc = first_and_last locations in
  Some { path = first_loc.path; start = first_loc.start; end_ = last_loc.end_ }

(*****************************************************************************)
(* Compare (=~ deriving ord) *)
(*****************************************************************************)

(* alt: it could be good to automatically derive those, but Iago got
 * bitten in the past and prefer explicit code.
 *)
let compare_position (a : position) b = Int.compare a.offset b.offset

let compare_location (a : location) (b : location) =
  let c = Fpath.compare a.path b.path in
  if c <> 0 then c
  else
    let c = compare_position a.start b.start in
    if c <> 0 then c else compare_position a.end_ b.end_

let compare_metavar_value (a : metavar_value) (b : metavar_value) =
  let c = compare_position a.start b.start in
  if c <> 0 then c else compare_position a.end_ b.end_

(* Generic list comparison. The input lists must already be sorted according
   to 'compare_elt'.

   [1] < [2]
   [1] < [1; 2]
   [1; 2] < [2]
*)
let rec compare_sorted_list compare_elt a b =
  match (a, b) with
  | [], [] -> 0
  | [], _ :: _ -> -1
  | _ :: _, [] -> 1
  | a :: aa, b :: bb ->
      let c = compare_elt a b in
      if c <> 0 then c else compare_sorted_list compare_elt aa bb

(*
   Order the metavariable bindings by location first, then by name.
   (could go the other way too; feel free to change)
*)
let compare_metavar_binding (name1, mv1) (name2, mv2) =
  let c = compare_metavar_value mv1 mv2 in
  if c <> 0 then c else String.compare name1 name2

(* Assumes the metavariable captures within each match_extra are already
   sorted. *)
let compare_match_extra (a : core_match_extra) (b : core_match_extra) =
  let c = compare_sorted_list compare_metavar_binding a.metavars b.metavars in
  if c <> 0 then c else compare a.message b.message

(*
   While the locations of the matches are already in correct order, they
   come in a reverse order when looking at the metavariables that they
   match. This function makes a best a effort to return the results
   in a natural order.
*)
let compare_match (a : core_match) (b : core_match) =
  let (aloc : location) = { path = a.path; start = a.start; end_ = a.end_ } in
  let (bloc : location) = { path = b.path; start = b.start; end_ = b.end_ } in

  let c = compare_location aloc bloc in
  if c <> 0 then c else compare_match_extra a.extra b.extra

let sort_metavars (metavars : (string * metavar_value) list) =
  List.stable_sort compare_metavar_binding metavars

let sort_extra (extra : core_match_extra) =
  { extra with metavars = sort_metavars extra.metavars }

let sort_core_matches (matches : core_match list) : core_match list =
  let matches =
    matches
    |> List_.map (fun (x : core_match) -> { x with extra = sort_extra x.extra })
  in
  List.stable_sort compare_match matches

(*
   Order by:
     file path, start line, start column, end line, end column, rule name

   This uses the same ordering as in pysemgrep in RuleMatch.get_ordering_key()
*)
let compare_cli_matches (a : cli_match) (b : cli_match) =
  let c = Fpath.compare a.path b.path in
  if c <> 0 then c
  else
    let a_start = a.start in
    let b_start = b.start in
    let c = Int.compare a_start.line b_start.line in
    if c <> 0 then c
    else
      let c = Int.compare a_start.col b_start.col in
      if c <> 0 then c
      else
        let a_end = a.end_ in
        let b_end = b.end_ in
        let c = Int.compare a_end.line b_end.line in
        if c <> 0 then c
        else
          let c = Int.compare a_end.col b_end.col in
          if c <> 0 then c else Rule_ID.compare a.check_id b.check_id

let sort_cli_matches xs = List.sort compare_cli_matches xs
